########################################################
## PROGRAM NAME: 010_misc.R                           ##
## AUTHOR: MATT MLECZKO                               ##
## DATE CREATED: 05/05/2021                           ##
## INPUTS:                                            ##
##    002_af_wide_final.Rda                           ##
##                                                    ##
## OUTPUTS:                                           ##
##    010_misc.txt (log of printed output)            ##
## PURPOSE: Misc analyses for:                        ##
##             Table 2                                ##
##             Appendix Table 1                       ##
##                                                    ##
## LIST OF UPDATES:                                   ##
##                                                    ##
########################################################

## Divert printed output and messages/warnings to a log file.
## The connection is called `log_con` rather than `log` so that it does not
## mask base::log() in the workspace.
## NOTE: when sink() is given a connection (not a file name) it opens the
## connection in "wt" mode and does not use `append`, so each run overwrites
## 010_misc.txt. The output sink must be set up first: the message sink
## requires a connection that is already open.
log_con <- file("010_misc.txt")
sink(log_con, append = TRUE)
sink(log_con, append = TRUE, type = "message")

## Fix the RNG seed (R reads the literal 08540 as the number 8540).
set.seed(08540)

## load libraries ##

library(tidycensus)
library(tidyverse)
library(foreign)
library(stringr)
library(tm)
library(gdata)
library(gsubfn)
library(readxl)
library(stargazer)
library(ggplot2)
library(haven)
library(gridExtra)

## define paths ##
## Root locations; both are placeholders to be filled in before running ##
data_path <- "PATH TO DATA HERE"
progs <- "PATH TO PROGRAMS HERE"

## Relative file names used after this point resolve against data_path ##
setwd(data_path)

##
## helper functions ##
##

## `%notin%` is the logical complement of `%in%` ##
`%notin%` <- function(x, table) !(x %in% table)

## Source the project helper scripts. `progs` is pasted directly in front of
## each file name, so it has to end with a path separator. ##
invisible(lapply(
  c("stata_merge.R", "hud_group_zip.R", "rd_data_zip.R"),
  function(helper_file) source(paste0(progs, helper_file))
))

## Load the analytic file; it is expected to provide `af.wide.final`, the
## data frame used by every analysis below ##
load(file = "002_af_wide_final.Rda")

##
## stats for table 2 ## 
##

## Proportion of rows in each category of the *_jt_* variables, by year.
## useNA = "ifany" reports missing values as their own category when present.

## ld_jt ##
prop.table(table(af.wide.final[["ld_jt_2000"]], useNA = "ifany"))
prop.table(table(af.wide.final[["ld_jt_2010"]], useNA = "ifany"))
prop.table(table(af.wide.final[["ld_jt_2020"]], useNA = "ifany"))

## md_jt ##
prop.table(table(af.wide.final[["md_jt_2000"]], useNA = "ifany"))
prop.table(table(af.wide.final[["md_jt_2010"]], useNA = "ifany"))
prop.table(table(af.wide.final[["md_jt_2020"]], useNA = "ifany"))

## hd_jt ##
prop.table(table(af.wide.final[["hd_jt_2000"]], useNA = "ifany"))
prop.table(table(af.wide.final[["hd_jt_2010"]], useNA = "ifany"))
prop.table(table(af.wide.final[["hd_jt_2020"]], useNA = "ifany"))

##
## create appendix table 1 ##
##

## Descriptive statistics for every measure/year combination:
## summary() (min, quartiles, mean, max, NA count) followed by the SD.
##
## sd() is always called with na.rm = TRUE. Previously only two of the 2020
## variables dropped missing values, so any other variable containing an NA
## would have reported an SD of NA. For variables without missing values the
## result is unchanged.
##
## A label line is printed before each variable so the log can be read
## without the source next to it. local() keeps the loop helpers out of the
## global workspace.
local({
  measures <- c(
    "trt_entropy_ers",     # trt entropy
    "trt_entropy_sess",
    "trt_entropy_joints",
    "msa_entropy_ers",     # msa entropy
    "msa_D",               # msa racial/ethnic segregation
    "msa_H4_adj"           # msa income segregation
  )
  years <- c(2000, 2010, 2020)

  for (measure in measures) {
    for (yr in years) {
      var_name <- paste0(measure, "_", yr)
      values <- af.wide.final[[var_name]]

      cat("\n## ", var_name, " ##\n", sep = "")
      print(summary(values))
      print(sd(values, na.rm = TRUE))
    }
  }
})

##
## dig into trends a bit more 
##

## What is driving the trend in integrated, majority-white, low-income
## neighborhoods? (original note: neighborhoods that become more
## ethnoracially integrated) ##

## Rows selected on their 2020 status (int_jt_relf_a_2020 == 1, majority
## white, majority low income), then counted by the combination of
## majority race/SES label in each year and the three integration
## trajectories. Each trajectory is the 2000, 2010 and 2020 indicator pasted
## together, e.g. "111". `share` is the group's fraction of selected rows.
int.wl <- af.wide.final %>%
  filter(int_jt_relf_a_2020 == 1,
         maj_race_2020 == "per_white",
         maj_ses_2020 == "linc") %>%
  mutate(
    maj_race_ses_2000 = paste0(maj_race_2000, maj_ses_2000),
    maj_race_ses_2010 = paste0(maj_race_2010, maj_ses_2010),
    maj_race_ses_2020 = paste0(maj_race_2020, maj_ses_2020),
    eth_int_traj = paste0(int_er_relf_2000, int_er_relf_2010, int_er_relf_2020),
    ses_int_traj = paste0(int_ses_rel_a_2000, int_ses_rel_a_2010, int_ses_rel_a_2020),
    ov_int_traj = paste0(int_jt_relf_a_2000, int_jt_relf_a_2010, int_jt_relf_a_2020)
  ) %>%
  group_by(maj_race_ses_2000, maj_race_ses_2010, maj_race_ses_2020,
           eth_int_traj, ses_int_traj, ov_int_traj) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  mutate(share = n / sum(n))

## Ten most common combinations and the share they jointly cover
top10 <- int.wl %>%
  arrange(-share) %>%
  head(10)

sum(top10[["share"]])

## Shares by which dimension(s) were "111" in all three years.
## NOTE(review): 0.26 is a hard-coded value subtracted from the first sum;
## its source is not shown in this script -- confirm it is still correct
## whenever the input data change.
with(int.wl, sum(share[eth_int_traj == "111" & ses_int_traj == "111"])) - 0.26
with(int.wl, sum(share[eth_int_traj == "111" & ses_int_traj != "111"]))
with(int.wl, sum(share[eth_int_traj != "111" & ses_int_traj == "111"]))


## Row-level (unaggregated) data for the rows that in 2020 have
## int_jt_relf_a_2020 == 1, majority white and majority low income.
int.wl.full <- af.wide.final %>%
  filter(int_jt_relf_a_2020 == 1,
         maj_race_2020 == "per_white",
         maj_ses_2020 == "linc")

## Distribution of the majority-race label in each year
prop.table(table(int.wl.full[["maj_race_2000"]]))
prop.table(table(int.wl.full[["maj_race_2010"]]))
prop.table(table(int.wl.full[["maj_race_2020"]]))

## per_fb_*: selected rows first, then the full analytic file for comparison
with(int.wl.full, summary(per_fb_2000))
with(af.wide.final, summary(per_fb_2000))

with(int.wl.full, summary(per_fb_2010))
with(af.wide.final, summary(per_fb_2010))

with(int.wl.full, summary(per_fb_2020))
with(af.wide.final, summary(per_fb_2020))

## per_* group variables of the selected rows, by year
with(int.wl.full, summary(per_asian_2000))
with(int.wl.full, summary(per_asian_2010))
with(int.wl.full, summary(per_asian_2020))

with(int.wl.full, summary(per_black_2000))
with(int.wl.full, summary(per_black_2010))
with(int.wl.full, summary(per_black_2020))

with(int.wl.full, summary(per_hl_2000))
with(int.wl.full, summary(per_hl_2010))
with(int.wl.full, summary(per_hl_2020))

with(int.wl.full, summary(per_aian_2000))
with(int.wl.full, summary(per_aian_2010))
with(int.wl.full, summary(per_aian_2020))

with(int.wl.full, summary(per_other_2000))
with(int.wl.full, summary(per_other_2010))
with(int.wl.full, summary(per_other_2020))

with(int.wl.full, summary(per_white_2000))
with(int.wl.full, summary(per_white_2010))
with(int.wl.full, summary(per_white_2020))

##
## what is driving the int white high income neighborhoods trend ## 
##

## Rows selected on their 2020 status (int_jt_relf_a_2020 == 1, majority
## white, majority high income), then counted by the combination of
## majority race/SES label in each year and the three integration
## trajectories. Each trajectory is the 2000, 2010 and 2020 indicator pasted
## together, e.g. "111". `share` is the group's fraction of selected rows.
int.wh <- af.wide.final %>%
  filter(int_jt_relf_a_2020 == 1,
         maj_race_2020 == "per_white",
         maj_ses_2020 == "hinc") %>%
  mutate(
    maj_race_ses_2000 = paste0(maj_race_2000, maj_ses_2000),
    maj_race_ses_2010 = paste0(maj_race_2010, maj_ses_2010),
    maj_race_ses_2020 = paste0(maj_race_2020, maj_ses_2020),
    eth_int_traj = paste0(int_er_relf_2000, int_er_relf_2010, int_er_relf_2020),
    ses_int_traj = paste0(int_ses_rel_a_2000, int_ses_rel_a_2010, int_ses_rel_a_2020),
    ov_int_traj = paste0(int_jt_relf_a_2000, int_jt_relf_a_2010, int_jt_relf_a_2020)
  ) %>%
  group_by(maj_race_ses_2000, maj_race_ses_2010, maj_race_ses_2020,
           eth_int_traj, ses_int_traj, ov_int_traj) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  mutate(share = n / sum(n))

## Ten most common combinations and the share they jointly cover.
## NOTE: this reuses (overwrites) the name `top10`.
top10 <- int.wh %>%
  arrange(-share) %>%
  head(10)

sum(top10[["share"]])

## Shares by which dimension(s) were "111" in all three years.
## NOTE(review): 0.19 is a hard-coded value subtracted from the first sum;
## its source is not shown in this script -- confirm it is still correct
## whenever the input data change.
with(int.wh, sum(share[eth_int_traj == "111" & ses_int_traj == "111"])) - 0.19
with(int.wh, sum(share[eth_int_traj == "111" & ses_int_traj != "111"]))
with(int.wh, sum(share[eth_int_traj != "111" & ses_int_traj == "111"]))



## Row-level (unaggregated) data for the rows that in 2020 have
## int_jt_relf_a_2020 == 1, majority white and majority high income.
int.wh.full <- af.wide.final %>%
  filter(int_jt_relf_a_2020 == 1,
         maj_race_2020 == "per_white",
         maj_ses_2020 == "hinc")

## Distribution of the majority-race label in each year
prop.table(table(int.wh.full[["maj_race_2000"]]))
prop.table(table(int.wh.full[["maj_race_2010"]]))
prop.table(table(int.wh.full[["maj_race_2020"]]))

## per_fb_*: selected rows first, then the full analytic file for comparison
with(int.wh.full, summary(per_fb_2000))
with(af.wide.final, summary(per_fb_2000))

with(int.wh.full, summary(per_fb_2010))
with(af.wide.final, summary(per_fb_2010))

with(int.wh.full, summary(per_fb_2020))
with(af.wide.final, summary(per_fb_2020))

## per_* group variables of the selected rows, by year
with(int.wh.full, summary(per_asian_2000))
with(int.wh.full, summary(per_asian_2010))
with(int.wh.full, summary(per_asian_2020))

with(int.wh.full, summary(per_black_2000))
with(int.wh.full, summary(per_black_2010))
with(int.wh.full, summary(per_black_2020))

with(int.wh.full, summary(per_hl_2000))
with(int.wh.full, summary(per_hl_2010))
with(int.wh.full, summary(per_hl_2020))

with(int.wh.full, summary(per_aian_2000))
with(int.wh.full, summary(per_aian_2010))
with(int.wh.full, summary(per_aian_2020))

with(int.wh.full, summary(per_other_2000))
with(int.wh.full, summary(per_other_2010))
with(int.wh.full, summary(per_other_2020))

with(int.wh.full, summary(per_white_2000))
with(int.wh.full, summary(per_white_2010))
with(int.wh.full, summary(per_white_2020))

##
## what is driving the int hispanic/latino neighborhoods trend ## 
##

## roughly 50% were formerly white neighborhoods ##
## roughly 23% were always integrated low income Hispanic/Latino ## 
## the rest (~27%) ##

## Rows selected on their 2020 status (int_jt_relf_a_2020 == 1, majority
## Hispanic/Latino, majority low income), then counted by the combination of
## majority race/SES label in each year and the three integration
## trajectories. Each trajectory is the 2000, 2010 and 2020 indicator pasted
## together, e.g. "111". `share` is the group's fraction of selected rows.
int.hl <- af.wide.final %>%
  filter(int_jt_relf_a_2020 == 1,
         maj_race_2020 == "per_hl",
         maj_ses_2020 == "linc") %>%
  mutate(
    maj_race_ses_2000 = paste0(maj_race_2000, maj_ses_2000),
    maj_race_ses_2010 = paste0(maj_race_2010, maj_ses_2010),
    maj_race_ses_2020 = paste0(maj_race_2020, maj_ses_2020),
    eth_int_traj = paste0(int_er_relf_2000, int_er_relf_2010, int_er_relf_2020),
    ses_int_traj = paste0(int_ses_rel_a_2000, int_ses_rel_a_2010, int_ses_rel_a_2020),
    ov_int_traj = paste0(int_jt_relf_a_2000, int_jt_relf_a_2010, int_jt_relf_a_2020)
  ) %>%
  group_by(maj_race_ses_2000, maj_race_ses_2010, maj_race_ses_2020,
           eth_int_traj, ses_int_traj, ov_int_traj) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  mutate(share = n / sum(n))

## Share whose label contains "per_white" in both 2000 and 2010
int.hl.fwall <- int.hl %>%
  filter(grepl("per_white", maj_race_ses_2000, fixed = TRUE),
         grepl("per_white", maj_race_ses_2010, fixed = TRUE))

sum(int.hl.fwall[["share"]])

## Share whose label contains "per_white" in 2000 but no longer in 2010
int.hl.fwone <- int.hl %>%
  filter(grepl("per_white", maj_race_ses_2000, fixed = TRUE),
         !grepl("per_white", maj_race_ses_2010, fixed = TRUE))

sum(int.hl.fwone[["share"]])

## Rows that were majority white in 2000 and, in 2020, are majority
## Hispanic/Latino, majority low income and have int_jt_relf_a_2020 == 1.
wh.t.hl <- af.wide.final %>%
  filter(maj_race_2000 == "per_white",
         maj_race_2020 == "per_hl",
         int_jt_relf_a_2020 == 1,
         maj_ses_2020 == "linc")

## per_fb_* by year
with(wh.t.hl, summary(per_fb_2000))
with(wh.t.hl, summary(per_fb_2010))
with(wh.t.hl, summary(per_fb_2020))

## per_asian / per_black / per_hl / per_white in 2000
with(wh.t.hl, summary(per_asian_2000))
with(wh.t.hl, summary(per_black_2000))
with(wh.t.hl, summary(per_hl_2000))
with(wh.t.hl, summary(per_white_2000))

## ... in 2010
with(wh.t.hl, summary(per_asian_2010))
with(wh.t.hl, summary(per_black_2010))
with(wh.t.hl, summary(per_hl_2010))
with(wh.t.hl, summary(per_white_2010))

## ... in 2020
with(wh.t.hl, summary(per_asian_2020))
with(wh.t.hl, summary(per_black_2020))
with(wh.t.hl, summary(per_hl_2020))
with(wh.t.hl, summary(per_white_2020))


## The remaining 2020 rows (int_jt_relf_a_2020 == 1, majority
## Hispanic/Latino, majority low income): rows whose GEOID appears in
## wh.t.hl are excluded by the first filter. The second filter drops rows
## labelled "per_hllinc" in all three years with an overall trajectory of
## "111". What is left is counted by label/trajectory combination, with
## `share` as the fraction of the remaining rows.
hl.t.hl <- af.wide.final %>%
  filter(int_jt_relf_a_2020 == 1,
         maj_race_2020 == "per_hl",
         maj_ses_2020 == "linc",
         !(GEOID %in% wh.t.hl$GEOID)) %>%
  mutate(
    maj_race_ses_2000 = paste0(maj_race_2000, maj_ses_2000),
    maj_race_ses_2010 = paste0(maj_race_2010, maj_ses_2010),
    maj_race_ses_2020 = paste0(maj_race_2020, maj_ses_2020),
    eth_int_traj = paste0(int_er_relf_2000, int_er_relf_2010, int_er_relf_2020),
    ses_int_traj = paste0(int_ses_rel_a_2000, int_ses_rel_a_2010, int_ses_rel_a_2020),
    ov_int_traj = paste0(int_jt_relf_a_2000, int_jt_relf_a_2010, int_jt_relf_a_2020)
  ) %>%
  filter(!(maj_race_ses_2000 == "per_hllinc" &
             maj_race_ses_2010 == "per_hllinc" &
             maj_race_ses_2020 == "per_hllinc" &
             ov_int_traj == "111")) %>%
  group_by(maj_race_ses_2000, maj_race_ses_2010, maj_race_ses_2020,
           eth_int_traj, ses_int_traj, ov_int_traj) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  mutate(share = n / sum(n))

## Shares by which trajectory was "111" in all three years
with(hl.t.hl, sum(share[ov_int_traj == "111"]))
with(hl.t.hl, sum(share[eth_int_traj == "111" & ses_int_traj != "111"]))
with(hl.t.hl, sum(share[eth_int_traj != "111" & ses_int_traj == "111"]))

##
## what's driving the low-income stable integration finding ##
##

## Rows with int_jt_relf_a equal to 1 in all three years
stab.int <- af.wide.final %>%
  filter(int_jt_relf_a_2000 == 1,
         int_jt_relf_a_2010 == 1,
         int_jt_relf_a_2020 == 1)

## Distribution of the majority-SES label among those rows, by year
prop.table(table(stab.int[["maj_ses_2000"]]))
prop.table(table(stab.int[["maj_ses_2010"]]))
prop.table(table(stab.int[["maj_ses_2020"]]))

##
## scatter plot of entropy and ethnoracial shares ##
##

## One scatter plot per group share (x) against trt_entropy_ers_2020 (y),
## all for 2020, arranged in a 2 x 2 grid.
## NOTE(review): the y axis is labelled "Scaled joint entropy" but the plotted
## variable is trt_entropy_ers_2020; the data also contain
## trt_entropy_joints_2020 -- confirm that label and variable agree.
p1 <- ggplot(data = af.wide.final,
             mapping = aes(x = per_asian_2020, y = trt_entropy_ers_2020)) +
  geom_point() +
  labs(x = "Percent Asian", y = "Scaled joint entropy") +
  theme_minimal()

p2 <- ggplot(data = af.wide.final,
             mapping = aes(x = per_black_2020, y = trt_entropy_ers_2020)) +
  geom_point() +
  labs(x = "Percent Black", y = "Scaled joint entropy") +
  theme_minimal()

p3 <- ggplot(data = af.wide.final,
             mapping = aes(x = per_hl_2020, y = trt_entropy_ers_2020)) +
  geom_point() +
  labs(x = "Percent Hispanic/Latino", y = "Scaled joint entropy") +
  theme_minimal()

p4 <- ggplot(data = af.wide.final,
             mapping = aes(x = per_white_2020, y = trt_entropy_ers_2020)) +
  geom_point() +
  labs(x = "Percent White", y = "Scaled joint entropy") +
  theme_minimal()

## Draw the four panels on the active graphics device, two rows
grid.arrange(p1, p2, p3, p4, nrow = 2)

## scatter plot of entropy and ses shares ##

## One scatter plot per income share (x) against trt_entropy_ers_2020 (y),
## all for 2020, arranged over two rows.
## NOTE(review): the y axis is labelled "Scaled joint entropy" but the plotted
## variable is trt_entropy_ers_2020; the data also contain
## trt_entropy_sess_2020 and trt_entropy_joints_2020 -- confirm which measure
## is intended for the SES panels.
p1.ses <- ggplot(data = af.wide.final,
                 mapping = aes(x = linc_2020, y = trt_entropy_ers_2020)) +
  geom_point() +
  labs(x = "Percent Low-Income", y = "Scaled joint entropy") +
  theme_minimal()

p2.ses <- ggplot(data = af.wide.final,
                 mapping = aes(x = minc_2020, y = trt_entropy_ers_2020)) +
  geom_point() +
  labs(x = "Percent Moderate-Income", y = "Scaled joint entropy") +
  theme_minimal()

p3.ses <- ggplot(data = af.wide.final,
                 mapping = aes(x = hinc_2020, y = trt_entropy_ers_2020)) +
  geom_point() +
  labs(x = "Percent High-Income", y = "Scaled joint entropy") +
  theme_minimal()

## Draw the three panels on the active graphics device, two rows
grid.arrange(p1.ses, p2.ses, p3.ses, nrow = 2)

### END OF PROGRAM ###

## Stop logging. The message stream is restored first: removing the output
## sink closes the log connection (sink() opened it itself), so restoring the
## output first would leave messages pointed at a closed connection, even if
## only briefly.
sink(type = "message")
sink()
